{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np \n",
    "import jieba # pip install jieba\n",
    "\n",
    "from tensorflow.keras.layers import Dense, Input, Flatten, Dropout\n",
    "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, concatenate\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读入数据\n",
    "neg=pd.read_excel('data/neg.xls',header=None)\n",
    "pos=pd.read_excel('data/pos.xls',header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>做为一本声名在外的流行书，说的还是广州的外企，按道理应该和我的生存环境差不多啊。但是一看之下...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>作者有明显的自恋倾向，只有有老公养不上班的太太们才能像她那样生活。很多方法都不实用，还有抄袭...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>作者完全是以一个过来的自认为是成功者的角度去写这个问题，感觉很不客观。虽然不是很喜欢，但是，...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>作者提倡内调，不信任化妆品，这点赞同。但是所列举的方法太麻烦，配料也不好找。不是太实用。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>作者的文笔一般，观点也是和市面上的同类书大同小异，不推荐读者购买。</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0\n",
       "0  做为一本声名在外的流行书，说的还是广州的外企，按道理应该和我的生存环境差不多啊。但是一看之下...\n",
       "1  作者有明显的自恋倾向，只有有老公养不上班的太太们才能像她那样生活。很多方法都不实用，还有抄袭...\n",
       "2  作者完全是以一个过来的自认为是成功者的角度去写这个问题，感觉很不客观。虽然不是很喜欢，但是，...\n",
       "3       作者提倡内调，不信任化妆品，这点赞同。但是所列举的方法太麻烦，配料也不好找。不是太实用。\n",
       "4                  作者的文笔一般，观点也是和市面上的同类书大同小异，不推荐读者购买。"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neg[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10428\n",
      "10677\n"
     ]
    }
   ],
   "source": [
    "#合并语料\n",
    "pn = pd.concat([pos,neg],ignore_index=True) \n",
    "#计算语料数目\n",
    "neglen = len(neg)\n",
    "print(neglen)\n",
    "poslen = len(pos) \n",
    "print(poslen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache C:\\Users\\DELL\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.765 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "#定义分词函数\n",
    "cw = lambda x: list(jieba.cut(x))\n",
    "pn['words'] = pn[0].apply(cw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一...</td>\n",
       "      <td>[做, 父母, 一定, 要, 有, 刘墉, 这样, 的, 心态, ，, 不断, 地, 学习,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...</td>\n",
       "      <td>[作者, 真有, 英国人, 严谨, 的, 风格, ，, 提出, 观点, 、, 进行, 论述,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...</td>\n",
       "      <td>[作者, 长篇大论, 借用, 详细, 报告, 数据处理, 工作, 和, 计算结果, 支持, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...</td>\n",
       "      <td>[作者, 在, 战, 几时, 之前, 用, 了, ＂, 拥抱, ＂, 令人, 叫绝, ．, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...</td>\n",
       "      <td>[作者, 在, 少年, 时即, 喜, 阅读, ，, 能, 看出, 他, 精读, 了, 无数,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21100</th>\n",
       "      <td>安装我自己花了500多，美的够黑心的，真的是烦心，安装的售后叼的要死！差评！！！！！</td>\n",
       "      <td>[安装, 我, 自己, 花, 了, 500, 多, ，, 美的, 够, 黑心, 的, ，, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21101</th>\n",
       "      <td>东西不错，售后太差，安装一个热水器400块钱，像话吗？钱给完了发现被黑了！</td>\n",
       "      <td>[东西, 不错, ，, 售后, 太, 差, ，, 安装, 一个, 热水器, 400, 块钱,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21102</th>\n",
       "      <td>碰到最差的、最骗人的卖家，好吧，我倒霉！这个卖家太不厚道了，显示所在地上海，我买这个热水器选...</td>\n",
       "      <td>[碰到, 最差, 的, 、, 最, 骗人, 的, 卖家, ，, 好, 吧, ，, 我, 倒霉...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21103</th>\n",
       "      <td>宝贝不错，物流也不错，售后差，</td>\n",
       "      <td>[宝贝, 不错, ，, 物流, 也, 不错, ，, 售后, 差, ，]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21104</th>\n",
       "      <td>美的售后太垃圾，其他售后都是两小时回电话，美的是24小时，结果超过市区不到五公里问我收五十，...</td>\n",
       "      <td>[美的, 售后, 太, 垃圾, ，, 其他, 售后, 都, 是, 两, 小时, 回, 电话,...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>21105 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                       0  \\\n",
       "0      做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一...   \n",
       "1      作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...   \n",
       "2      作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...   \n",
       "3      作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...   \n",
       "4      作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...   \n",
       "...                                                  ...   \n",
       "21100         安装我自己花了500多，美的够黑心的，真的是烦心，安装的售后叼的要死！差评！！！！！   \n",
       "21101              东西不错，售后太差，安装一个热水器400块钱，像话吗？钱给完了发现被黑了！   \n",
       "21102  碰到最差的、最骗人的卖家，好吧，我倒霉！这个卖家太不厚道了，显示所在地上海，我买这个热水器选...   \n",
       "21103                                    宝贝不错，物流也不错，售后差，   \n",
       "21104  美的售后太垃圾，其他售后都是两小时回电话，美的是24小时，结果超过市区不到五公里问我收五十，...   \n",
       "\n",
       "                                                   words  \n",
       "0      [做, 父母, 一定, 要, 有, 刘墉, 这样, 的, 心态, ，, 不断, 地, 学习,...  \n",
       "1      [作者, 真有, 英国人, 严谨, 的, 风格, ，, 提出, 观点, 、, 进行, 论述,...  \n",
       "2      [作者, 长篇大论, 借用, 详细, 报告, 数据处理, 工作, 和, 计算结果, 支持, ...  \n",
       "3      [作者, 在, 战, 几时, 之前, 用, 了, ＂, 拥抱, ＂, 令人, 叫绝, ．, ...  \n",
       "4      [作者, 在, 少年, 时即, 喜, 阅读, ，, 能, 看出, 他, 精读, 了, 无数,...  \n",
       "...                                                  ...  \n",
       "21100  [安装, 我, 自己, 花, 了, 500, 多, ，, 美的, 够, 黑心, 的, ，, ...  \n",
       "21101  [东西, 不错, ，, 售后, 太, 差, ，, 安装, 一个, 热水器, 400, 块钱,...  \n",
       "21102  [碰到, 最差, 的, 、, 最, 骗人, 的, 卖家, ，, 好, 吧, ，, 我, 倒霉...  \n",
       "21103                [宝贝, 不错, ，, 物流, 也, 不错, ，, 售后, 差, ，]  \n",
       "21104  [美的, 售后, 太, 垃圾, ，, 其他, 售后, 都, 是, 两, 小时, 回, 电话,...  \n",
       "\n",
       "[21105 rows x 2 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1804"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 一行数据最多的词汇数\n",
    "max_document_length = max([len(x) for x in pn['words']])\n",
    "max_document_length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 设置一个评论最多1000个词\n",
    "max_document_length = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = [' '.join(x) for x in pn['words']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'宝贝 不错 ， 物流 也 不错 ， 售后 差 ，'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看一条评论\n",
    "texts[-2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 实例化分词器，设置字典中最大词汇数为30000\n",
    "tokenizer = Tokenizer(num_words=30000)\n",
    "# 传入我们的训练数据，建立词典\n",
    "tokenizer.fit_on_texts(texts) \n",
    "# 把词转换为编号，词的编号根据词频设定，频率越大，编号越小\n",
    "sequences = tokenizer.texts_to_sequences(texts) \n",
    "# 把序列设定为1000的长度，超过1000的部分舍弃，不到1000则补0\n",
    "sequences = pad_sequences(sequences, maxlen=1000, padding='post')  \n",
    "sequences = np.array(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 词对应编号的字典\n",
    "dict_text = tokenizer.word_index\n",
    "dict_text['也']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1318,   24,    1, 1482,    9,   24,    1,  909,  156,    1,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
       "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences[-2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义标签\n",
    "positive_labels = [[0, 1] for _ in range(poslen)]\n",
    "negative_labels = [[1, 0] for _ in range(neglen)]\n",
    "y = np.concatenate([positive_labels, negative_labels], 0)\n",
    "\n",
    "# 打乱数据\n",
    "np.random.seed(10)\n",
    "shuffle_indices = np.random.permutation(np.arange(len(y)))\n",
    "x_shuffled = sequences[shuffle_indices]\n",
    "y_shuffled = y[shuffle_indices]\n",
    "\n",
    "# 数据集切分为两部分\n",
    "test_sample_index = -1 * int(0.1 * float(len(y)))\n",
    "x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]\n",
    "y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"functional_1\"\n",
      "__________________________________________________________________________________________________\n",
      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
      "==================================================================================================\n",
      "input_1 (InputLayer)            [(None, 1000)]       0                                            \n",
      "__________________________________________________________________________________________________\n",
      "embedding (Embedding)           (None, 1000, 128)    3840000     input_1[0][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "conv1d (Conv1D)                 (None, 998, 32)      12320       embedding[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_3 (Conv1D)               (None, 997, 32)      16416       embedding[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_6 (Conv1D)               (None, 996, 32)      20512       embedding[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d (MaxPooling1D)    (None, 199, 32)      0           conv1d[0][0]                     \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_3 (MaxPooling1D)  (None, 199, 32)      0           conv1d_3[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_6 (MaxPooling1D)  (None, 199, 32)      0           conv1d_6[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_1 (Conv1D)               (None, 197, 32)      3104        max_pooling1d[0][0]              \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_4 (Conv1D)               (None, 196, 32)      4128        max_pooling1d_3[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_7 (Conv1D)               (None, 195, 32)      5152        max_pooling1d_6[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_1 (MaxPooling1D)  (None, 39, 32)       0           conv1d_1[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_4 (MaxPooling1D)  (None, 39, 32)       0           conv1d_4[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_7 (MaxPooling1D)  (None, 39, 32)       0           conv1d_7[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_2 (Conv1D)               (None, 37, 32)       3104        max_pooling1d_1[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_5 (Conv1D)               (None, 36, 32)       4128        max_pooling1d_4[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_8 (Conv1D)               (None, 35, 32)       5152        max_pooling1d_7[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_2 (MaxPooling1D)  (None, 1, 32)        0           conv1d_2[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_5 (MaxPooling1D)  (None, 1, 32)        0           conv1d_5[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_8 (MaxPooling1D)  (None, 1, 32)        0           conv1d_8[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "flatten (Flatten)               (None, 32)           0           max_pooling1d_2[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "flatten_1 (Flatten)             (None, 32)           0           max_pooling1d_5[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "flatten_2 (Flatten)             (None, 32)           0           max_pooling1d_8[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "concatenate (Concatenate)       (None, 96)           0           flatten[0][0]                    \n",
      "                                                                 flatten_1[0][0]                  \n",
      "                                                                 flatten_2[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "dense (Dense)                   (None, 128)          12416       concatenate[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "dropout (Dropout)               (None, 128)          0           dense[0][0]                      \n",
      "__________________________________________________________________________________________________\n",
      "dense_1 (Dense)                 (None, 2)            258         dropout[0][0]                    \n",
      "==================================================================================================\n",
      "Total params: 3,926,690\n",
      "Trainable params: 3,926,690\n",
      "Non-trainable params: 0\n",
      "__________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# 定义函数式模型\n",
    "# 模型输入\n",
    "sequence_input = Input(shape=(1000,))\n",
    "# Embedding层，30000表示30000个词，每个词对应的向量为128维，序列长度为1000\n",
    "embedding_layer = Embedding(30000,\n",
    "                            128,\n",
    "                            input_length=1000)\n",
    "embedded_sequences = embedding_layer(sequence_input)\n",
    "\n",
    "# 卷积核大小为3\n",
    "cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedded_sequences)\n",
    "cnn1 = MaxPooling1D(pool_size=5)(cnn1)\n",
    "cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)\n",
    "cnn1 = MaxPooling1D(pool_size=5)(cnn1)\n",
    "cnn1 = Conv1D(filters=32, kernel_size=3, activation='relu')(cnn1)\n",
    "cnn1 = MaxPooling1D(pool_size=37)(cnn1)\n",
    "cnn1 = Flatten()(cnn1)\n",
    "# 卷积核大小为4\n",
    "cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedded_sequences)\n",
    "cnn2 = MaxPooling1D(pool_size=5)(cnn2)\n",
    "cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)\n",
    "cnn2 = MaxPooling1D(pool_size=5)(cnn2)\n",
    "cnn2 = Conv1D(filters=32, kernel_size=4, activation='relu')(cnn2)\n",
    "cnn2 = MaxPooling1D(pool_size=36)(cnn2)\n",
    "cnn2 = Flatten()(cnn2)\n",
    "# 卷积核大小为5\n",
    "cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(embedded_sequences)\n",
    "cnn3 = MaxPooling1D(pool_size=5)(cnn3)\n",
    "cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)\n",
    "cnn3 = MaxPooling1D(pool_size=5)(cnn3)\n",
    "cnn3 = Conv1D(filters=32, kernel_size=5, activation='relu')(cnn3)\n",
    "cnn3 = MaxPooling1D(pool_size=35)(cnn3)\n",
    "cnn3 = Flatten()(cnn3)\n",
    "\n",
    "# 合并\n",
    "merge = concatenate([cnn1, cnn2, cnn3], axis=1)\n",
    "# 全链接层\n",
    "x = Dense(128, activation='relu')(merge)\n",
    "# Dropout层\n",
    "x = Dropout(0.5)(x)\n",
    "# 输出层\n",
    "preds = Dense(2, activation='softmax')(x)\n",
    "# 定义模型\n",
    "model = Model(sequence_input, preds)\n",
    "\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "149/149 [==============================] - 122s 816ms/step - loss: 0.4747 - acc: 0.7543 - val_loss: 0.2128 - val_acc: 0.9223\n",
      "Epoch 2/5\n",
      "149/149 [==============================] - 127s 849ms/step - loss: 0.1500 - acc: 0.9485 - val_loss: 0.1662 - val_acc: 0.9441\n",
      "Epoch 3/5\n",
      "149/149 [==============================] - 124s 835ms/step - loss: 0.0557 - acc: 0.9849 - val_loss: 0.1904 - val_acc: 0.9393\n",
      "Epoch 4/5\n",
      "149/149 [==============================] - 118s 794ms/step - loss: 0.0225 - acc: 0.9946 - val_loss: 0.2312 - val_acc: 0.9336\n",
      "Epoch 5/5\n",
      " 88/149 [================>.............] - ETA: 51s - loss: 0.0106 - acc: 0.9979"
     ]
    }
   ],
   "source": [
    "# 训练模型\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['acc'])\n",
    "\n",
    "model.fit(x_train, y_train,\n",
    "          batch_size=128,\n",
    "          epochs=5,\n",
    "          validation_data=(x_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 预测\n",
    "def predict(text):\n",
    "    # 对句子分词\n",
    "    cw = list(jieba.cut(text)) \n",
    "    word_id = []\n",
    "    # 把词转换为编号\n",
    "    for word in cw:\n",
    "        try:\n",
    "            temp = dict_text[word]\n",
    "            word_id.append(temp)\n",
    "        except:\n",
    "            word_id.append(0)\n",
    "    word_id = np.array(word_id)\n",
    "    word_id = word_id[np.newaxis,:]\n",
    "    sequences = pad_sequences(word_id, maxlen=1000, padding='post')  \n",
    "    result = np.argmax(model.predict(sequences))\n",
    "    if(result==1):\n",
    "        print(\"positive comment\")\n",
    "    else:\n",
    "        print(\"negative comment\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predict(\"东西质量不错，下次还会再来买\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
